#importing all necessary libraries
# Core numerics, dataframes and the matplotlib/seaborn plotting stack.
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
color = sns.color_palette()
from IPython.display import display
pd.options.display.max_columns = None  # show every column when displaying frames
# Standard plotly imports
# NOTE(review): `plotly.plotly` (online/Chart Studio mode) was split out into
# the `chart_studio` package in plotly 4.x — this import only works on 3.x.
import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
# Using plotly + cufflinks in offline mode
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import cufflinks as cf
cf.set_config_file(offline=True)
import cufflinks
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)
#ML Libraries from sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
#loading the data set and checking whether it has got loaded correctly
# FIX: use a raw string for the Windows path — the original relied on Python
# leaving unknown escapes like '\G' and '\d' untouched, which has been a
# DeprecationWarning since 3.6 and will eventually become an error.  The raw
# string produces the identical path without any escape processing.
dataFrame = pd.read_csv(r'D:\GLALMIWorkSpace\Module1\Week4\diabetes.csv')
dataFrame.head()
dataFrame.describe(include="all") #checking for stats
There are missing values in the dataset: the minimum value reported for Insulin, BMI, Glucose, BloodPressure and SkinThickness is zero, which is physiologically impossible and therefore indicates missing data. The summary also shows that the patients' ages range from 21 to 81.
#checking out the percentage of people having diabetes in the data set
f, ax = plt.subplots(1, 2, figsize = (10, 7))
# Bar chart of the Outcome class counts (0 = healthy, 1 = diabetic) on the left axis.
_ = dataFrame.Outcome.value_counts().plot.bar(ax = ax[0], rot = 0).set(xticklabels = ["Healthy", "Diabetic"])
# Pie chart of the same counts on the right axis; the two trailing calls in the
# tuple recolour the 2nd and 4th text artists of the pie (percentage labels)
# to the same light grey as the wedge edges for contrast.
_ = dataFrame.Outcome.value_counts().plot.pie(labels = ("Healthy", "Diabetic"), autopct = "%.2f%%", label = "", fontsize = 13., ax = ax[1],wedgeprops = {"linewidth": 1.5, "edgecolor": "#F7F7F7"}), ax[1].texts[1].set_color("#F7F7F7"), ax[1].texts[3].set_color("#F7F7F7")
The box plots below show the density of outliers present in each column, as well as the skewness of the distributions.
# Horizontal box plots of every column (ggplot2 styling) to eyeball outliers
# and skew at a glance.
plt.style.use('ggplot')
fig, axis = plt.subplots(figsize=(11, 15))
axis.set_facecolor('#fafafa')
axis.set(xlim=(-.05, 200))  # clip the x axis so the bulk of the data is visible
plt.ylabel('Column Values')
axis = sns.boxplot(data=dataFrame, orient='h', palette='Set2')
#function to use z score to identify the outliers for columns that require to be analyzed-just another way that clarifies
#that our data set has got zeroes
outliers=[]  # retained for backward compatibility; no longer mutated by the function
def detect_outlier(data, threshold=3):
    """Return the values in *data* whose absolute z-score exceeds *threshold*.

    Parameters
    ----------
    data : sequence of numbers (e.g. a pandas Series or a list)
    threshold : float, default 3
        Z-score cutoff beyond which a value is flagged as an outlier.

    Returns
    -------
    list
        The outlying values, in their original order.
    """
    # FIX: the original appended into the module-level ``outliers`` list, so a
    # second call returned the first call's results as well.  A fresh local
    # list makes the function pure and repeat-callable.
    mean = np.mean(data)
    std = np.std(data)
    return [y for y in data if np.abs((y - mean) / std) > threshold]
# Example: flag the BMI outliers with the z-score helper defined above.
outlier_datapoints = detect_outlier(dataFrame['BMI'])
print(outlier_datapoints)
#copying the original data set to a new dataframe for further operations
dataFrame_copy = dataFrame.copy(deep = True)
# In this dataset a 0 in these clinical columns really means "not recorded",
# so convert them to NaN before imputing.
# FIX: use np.nan — the upper-case ``np.NaN`` alias was removed in NumPy 2.0.
zero_means_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
dataFrame_copy[zero_means_missing] = dataFrame_copy[zero_means_missing].replace(0, np.nan)
## showing the count of NaNs per column
print(dataFrame_copy.isnull().sum())
#checking on the median of selected columns for each outcome class
data_median = dataFrame[['Age','BMI','SkinThickness','Outcome']].groupby(['Outcome'])[['Age','BMI','SkinThickness']].median()
data_median
We can see that a lot of values are recorded as zero. This is a problem, because we need valid values to analyze the data further.
#best way is to have mean/median being filled as the values that have NaN from previous steps.
# FIX: assign the filled column back instead of fillna(..., inplace=True) on a
# column selection — the chained-inplace pattern raises chained-assignment
# warnings in pandas 2.x and stops working under copy-on-write (pandas 3.0).
dataFrame_copy['Glucose'] = dataFrame_copy['Glucose'].fillna(dataFrame_copy['Glucose'].mean())
dataFrame_copy['BloodPressure'] = dataFrame_copy['BloodPressure'].fillna(dataFrame_copy['BloodPressure'].mean())
# the remaining columns are skewed, so the median is the more robust fill value
dataFrame_copy['SkinThickness'] = dataFrame_copy['SkinThickness'].fillna(dataFrame_copy['SkinThickness'].median())
dataFrame_copy['Insulin'] = dataFrame_copy['Insulin'].fillna(dataFrame_copy['Insulin'].median())
dataFrame_copy['BMI'] = dataFrame_copy['BMI'].fillna(dataFrame_copy['BMI'].median())
# Split the imputed frame by class so the two groups can be profiled side by side.
outcome = dataFrame_copy['Outcome']
healthy = dataFrame_copy[outcome == 0]
diabetic = dataFrame_copy[outcome == 1]
healthy.describe()
diabetic.describe()
# Overlay the per-class probability density of the first eight columns so the
# healthy and diabetic distributions can be compared feature by feature.
f, axes = plt.subplots(2, 4, figsize=(20, 10))
for col_idx, panel in enumerate(axes.flat):
    sns.distplot(healthy.iloc[:, col_idx], kde=True, hist=False,
                 kde_kws={'linewidth': 3}, label='Healthy', ax=panel)
    sns.distplot(diabetic.iloc[:, col_idx], kde=True, hist=False,
                 kde_kws={'linewidth': 3}, label='Diabetic', ax=panel)
    panel.legend()
f.suptitle('Comparison in the Density of Values for Healthy and Diabetic Classes in DataSet')
plt.show()
#plotly plots to check on how the distribution curve looks like for insulin across both classes
# FIX (security): the hard-coded Chart Studio username/API key were removed —
# never commit credentials to source.  Export PLOTLY_USERNAME and
# PLOTLY_API_KEY in your environment before running this cell.
import os
plotly.tools.set_credentials_file(username=os.environ.get('PLOTLY_USERNAME'),
                                  api_key=os.environ.get('PLOTLY_API_KEY'))
x1 = healthy['Insulin']    # insulin readings, non-diabetic class
x2 = diabetic['Insulin']   # insulin readings, diabetic class
hist_data = [x1, x2]
group_labels = ['Healthy', 'Diabetic']
# KDE curve + histogram + rug plot in a single figure
fig = ff.create_distplot(hist_data, group_labels,
curve_type='kde', show_hist=True, show_rug=True,bin_size=0)
fig['layout'].update(title='Insulin Level for Healthy v/s Diabetic')
py.iplot(fig)  # online mode: uploads the figure to Chart Studio
#BMI comparison for healthy versus diabetic
# FIX (security): hard-coded Chart Studio credentials removed — supply them
# via the PLOTLY_USERNAME / PLOTLY_API_KEY environment variables instead.
import os
plotly.tools.set_credentials_file(username=os.environ.get('PLOTLY_USERNAME'),
                                  api_key=os.environ.get('PLOTLY_API_KEY'))
x1 = healthy['BMI']    # BMI readings, non-diabetic class
x2 = diabetic['BMI']   # BMI readings, diabetic class
hist_data = [x1, x2]
group_labels = ['Healthy', 'Diabetic']
# KDE curve + histogram + rug plot in a single figure
fig = ff.create_distplot(hist_data, group_labels,
curve_type='kde', show_hist=True, show_rug=True,bin_size=0)
fig['layout'].update(title='BMI distribution for Healthy v/s Diabetic')
py.iplot(fig)  # online mode: uploads the figure to Chart Studio
# Post-imputation sanity checks: per-column histograms, pairwise scatter
# matrix coloured by class, correlation heatmap, and a BMI ~ SkinThickness
# regression fit per class.
dataFrame_copy.hist(figsize=(10, 10))
sns.pairplot(dataFrame_copy, hue="Outcome")
corr = dataFrame_copy.corr()
sns.heatmap(corr, annot=True)
sns.lmplot(x="SkinThickness", y="BMI", hue="Outcome", data=dataFrame_copy)
#grouping the age values to check the number of diabetic patients in each group
def _age_bucket(age):
    # Bucket boundaries mirror the dataset's 21-81 age range.
    if 21 <= age <= 31:
        return 'Young(21-31)'
    if 31 < age <= 51:
        return 'Middle Age(32-51)'
    return 'Old(52 and Above)'

age_group = [_age_bucket(age) for age in dataFrame['Age']]
dataFrame_copy['AgeGroup'] = age_group
dataFrame_copy['AgeGroup'].value_counts(ascending=False).plot(kind='bar', figsize=(5, 5))
sns.countplot(x="Outcome", hue="AgeGroup", data=dataFrame_copy)
This shows that young people are at a lower risk of diabetes.
# Diabetes outcome counts broken down by number of pregnancies.
sns.countplot(data=dataFrame_copy, x="Outcome", hue="Pregnancies")
To an extent, the number of pregnancies is also a factor: the higher the number, the more likely the person is diabetic. This might be because the number of pregnancies is itself correlated with age.
#since max and min and values are different for different columns, before building an ML model,scaling is important
#we are scaling using standard scaler library provided by sk learn
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
# FIX: the original hard-coded column list swapped 'Age' and
# 'DiabetesPedigreeFunction' relative to the frame's actual column order, so
# those two scaled features were silently mislabelled.  Reusing the source
# frame's own columns keeps labels and data aligned regardless of CSV order.
features = dataFrame_copy.drop(["Outcome", "AgeGroup"], axis=1)
X = pd.DataFrame(sc_X.fit_transform(features), columns=features.columns)
y = dataFrame_copy.Outcome
# Hold out one third of the rows for testing; stratifying on y keeps the
# healthy/diabetic ratio identical in both splits, and the fixed random_state
# makes the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1 / 3, random_state=2, stratify=y)
# Baseline model: logistic regression.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
model = LogisticRegression()
model.fit(X_train, y_train)                  # train on the training split
prediction = model.predict(X_test)           # predict the held-out labels
print("Accuracy ", metrics.accuracy_score(y_test, prediction)*100)
# Decision tree classifier.
DT = DecisionTreeClassifier()
DT.fit(X_train, y_train)
y_pred = DT.predict(X_test)   # kept for parity with the other model cells
# score() computes accuracy on the test split directly
print("Accuracy ", DT.score(X_test, y_test)*100)
# Gradient boosting classifier.
model = GradientBoostingClassifier()
model.fit(X_train, y_train)                # train the boosted ensemble
y_pred = model.predict(X_test)             # predictions reused by the confusion matrix below
accuracy_pct = model.score(X_test, y_test) * 100
print("Accuracy ", accuracy_pct)
#building confusion matrix for the gradient-boosting predictions
sns.set(font_scale=1.5)
# FIX: sklearn's signature is confusion_matrix(y_true, y_pred); the original
# passed (y_pred, y_test), which transposes the matrix and swaps the meaning
# of rows (actual class) and columns (predicted class).
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()
#KNN Classifier: sweep k = 1..14 and report every k that ties the best test accuracy
from sklearn.neighbors import KNeighborsClassifier
test_scores = []
train_scores = []
for k in range(1, 15):
    knn = KNeighborsClassifier(k)
    knn.fit(X_train, y_train)
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))
maximumscore = max(test_scores)
# list index -> neighbour count is idx + 1 (the sweep started at k = 1)
best_ks = [idx + 1 for idx, score in enumerate(test_scores) if score == maximumscore]
print('k = {}'.format(best_ks))
# Fit the final KNN model with the recommended neighbourhood size and report
# its held-out accuracy.
knn = KNeighborsClassifier(13)
knn.fit(X_train, y_train)
print("Accuracy ", knn.score(X_test, y_test)*100)